{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 673,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "% reset -f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from numpy.random import seed\n",
    "seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5814_8</td>\n",
       "      <td>1</td>\n",
       "      <td>With all this stuff going down at the moment w...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2381_9</td>\n",
       "      <td>1</td>\n",
       "      <td>\\The Classic War of the Worlds\\\" by Timothy Hi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7759_3</td>\n",
       "      <td>0</td>\n",
       "      <td>The film starts with a manager (Nicholas Bell)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3630_4</td>\n",
       "      <td>0</td>\n",
       "      <td>It must be assumed that those who praised this...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9495_8</td>\n",
       "      <td>1</td>\n",
       "      <td>Superbly trashy and wondrously unpretentious 8...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  sentiment                                             review\n",
       "0  5814_8          1  With all this stuff going down at the moment w...\n",
       "1  2381_9          1  \\The Classic War of the Worlds\\\" by Timothy Hi...\n",
       "2  7759_3          0  The film starts with a manager (Nicholas Bell)...\n",
       "3  3630_4          0  It must be assumed that those who praised this...\n",
       "4  9495_8          1  Superbly trashy and wondrously unpretentious 8..."
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "import gensim\n",
    "import pandas as pd\n",
    "\n",
    "data=pd.read_table(\"E:/bow/labeledTrainData.tsv\",encoding='latin1')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from inflection import singularize\n",
    "def preprocess(text):\n",
    "    text=text.lower()\n",
    "    text=re.sub(\"[^a-zA-Z]+\",\" \",text)\n",
    "    text2=text.split()\n",
    "    text3=' '.join([word for word in text2])\n",
    "    return(text3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'i like this'"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text='I like this'\n",
    "preprocess(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 3)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data['review']=data['review'].apply(preprocess)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5814_8</td>\n",
       "      <td>1</td>\n",
       "      <td>with all this stuff going down at the moment w...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2381_9</td>\n",
       "      <td>1</td>\n",
       "      <td>the classic war of the worlds by timothy hines...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7759_3</td>\n",
       "      <td>0</td>\n",
       "      <td>the film starts with a manager nicholas bell g...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3630_4</td>\n",
       "      <td>0</td>\n",
       "      <td>it must be assumed that those who praised this...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9495_8</td>\n",
       "      <td>1</td>\n",
       "      <td>superbly trashy and wondrously unpretentious s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  sentiment                                             review\n",
       "0  5814_8          1  with all this stuff going down at the moment w...\n",
       "1  2381_9          1  the classic war of the worlds by timothy hines...\n",
       "2  7759_3          0  the film starts with a manager nicholas bell g...\n",
       "3  3630_4          0  it must be assumed that those who praised this...\n",
       "4  9495_8          1  superbly trashy and wondrously unpretentious s..."
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "t=data['review'].tolist()\n",
    "t2=t\n",
    "for i in range(len(t)):\n",
    "    t2[i]=t[i].split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\\\n",
    "    level=logging.INFO)\n",
    "\n",
    "# Set values for various parameters\n",
    "num_features = 100    # Word vector dimensionality                      \n",
    "min_word_count = 50   # Minimum word count                        \n",
    "num_workers = 4       # Number of threads to run in parallel\n",
    "context = 9         # Context window size                                                                                    \n",
    "downsampling = 1e-4   # Downsample setting for frequent words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training model...\n"
     ]
    }
   ],
   "source": [
    "from gensim.models import word2vec\n",
    "print(\"Training model...\")\n",
    "w2v_model = word2vec.Word2Vec(t2, workers=num_workers, \\\n",
    "            size=num_features, min_count = min_word_count, \\\n",
    "            window = context, sample = downsampling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Admin\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  if __name__ == '__main__':\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[(u'giving', 0.7667176723480225),\n",
       " (u'deserve', 0.7130126357078552),\n",
       " (u'ruin', 0.6599821448326111),\n",
       " (u'make', 0.6377612352371216),\n",
       " (u'guarantee', 0.6223663091659546),\n",
       " (u'gave', 0.6218616962432861),\n",
       " (u'rate', 0.6184169054031372),\n",
       " (u'spoil', 0.607461154460907),\n",
       " (u'hope', 0.6068723797798157),\n",
       " (u'get', 0.601472795009613)]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_model.most_similar('give')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
